Neural composer

In [ ]:
!sudo dpkg -i ~/tensorflow/personal-scratch/kake/cudnn/libcudnn7_7.3.1.20-1+cuda9.0_amd64.deb
!pip install -I --user tensorflow-gpu pretty_midi pypianoroll pandas
In [1]:
import IPython
import pretty_midi
import numpy as np
from time import time
import matplotlib.pyplot as plt
import pypianoroll as pproll
import tensorflow as tf
import pandas as pd

tf.enable_eager_execution()
%matplotlib inline

Problem formulation

The task is framed as autoregressive multilabel classification: at each time step, perform a multilabel classification over the 128 keys, conditioned on the classification choices already made.

While this is quite straightforward, there is a discrepancy between training (next-step prediction) and the true objective (long-term generation).

I ended up doing it anyway; at the end of the notebook there is more information on other techniques attempted, and on some that were not.
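
A minimal, self-contained sketch of this objective with toy shapes (the Dense layer merely stands in for the LSTM model defined later; the real pipeline and model follow below):

In [ ]:
# Sketch of the objective on a toy piano roll of shape TIME x BATCH x KEYS:
# at each step, 128 independent binary classifications of the next step's keys.
T, B, K = 10, 2, 128
data = tf.cast(tf.random.uniform((T, B, K)) > .9, tf.float32)  # toy binary piano roll
x, y = data[:-1], data[1:]       # condition on step t, predict the keys at step t+1
projection = tf.layers.Dense(K)  # stand-in for the LSTM + projection used below
logits = projection(x)
loss = tf.losses.sigmoid_cross_entropy(multi_class_labels=y, logits=logits)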

In [2]:
BATCH_SIZE = 32
NUM_LAYERS = 2
RNN_SIZE = 256
FEATURE_SIZE = 128
EPSILON = 1e-5
L2_WEIGHT = .001
EPOCHS = 20000
PIANOROLL_PATH = 'songs/*.csv'
COMPOSERS = ['mz', 'br', 'de', 'ba']
SAVE_DIR = 'composer/'

Dataset preprocessing:

  • Assume pretty_midi piano_roll matrices
  • Assume the first two letters of the file name identify the composer
  • Remove the first column and first row (both are indices) and append an end-of-song marker (a column with all 128 keys set to 128)
  • Batch into mini-batches of BATCH_SIZE songs
  • Keep a record of each matrix's length and pad all matrices to the longest matrix in the mini-batch
  • Return mini-batches of shape BATCH_SIZE x NUM_KEYS x MAX_TIME_STEPS
  • Iterate over the songs EPOCHS times

As the dataset is very small, we don't bother with truncated backpropagation and simply cache the pre-batched matrices in memory.

In [3]:
with tf.device('cpu:0'):
    mapping = tf.contrib.lookup.index_table_from_tensor(tf.constant(COMPOSERS))

    def parse_numeric_csv(dataset):
        ''' numeric CSV parser '''
        return (dataset
                .map(lambda x: tf.sparse.to_dense(tf.string_split([x], '\n'), '')[0])
                .map(lambda x: tf.sparse.to_dense(tf.string_split(x, ','), '0'))
                .map(tf.strings.to_number))

    def get_composer_ids(x):
        ''' filename to composer ids '''
        file_name = tf.sparse.to_dense(tf.strings.split([x], '/'), '')[0, -1]
        prefix = tf.strings.substr(file_name, 0, 2)
        ids = mapping.lookup(prefix)
        return ids

    paths = tf.data.Dataset.list_files(PIANOROLL_PATH)

    files = (paths
            .map(tf.read_file)
            .apply(parse_numeric_csv)
            .map(lambda x: tf.concat([x[1:, 1:], 128. * tf.ones((128, 1))], axis=1))
            .map(lambda x: (x, tf.shape(x)[-1])))

    composers = paths.map(get_composer_ids)

    ds = tf.data.Dataset.zip((composers, files))
WARNING:tensorflow:From /home/finn/.local/lib/python3.6/site-packages/tensorflow/python/ops/sparse_ops.py:1165: sparse_to_dense (from tensorflow.python.ops.sparse_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Create a `tf.sparse.SparseTensor` and use `tf.sparse.to_dense` instead.
In [4]:
ds = (ds.cache()
        .repeat(EPOCHS)
        .padded_batch(BATCH_SIZE, padded_shapes=((), ([FEATURE_SIZE, -1], ()))))

ds = ds.apply(tf.data.experimental.prefetch_to_device('gpu:0'))

Model

  • Problem formulated as an autoregressive multilabel classification problem
  • Solved with a Long Short-Term Memory (LSTM) recurrent neural network
  • For each time step, independently classify the 128 keys using the sigmoid cross entropy between the model prediction and the keys active at the next time step
  • Initialise the LSTM rollout with a general trainable initialisation, or with an independent initialisation per composer
In [5]:
class BinarizedNeuralComposer(tf.keras.Model):
    def __init__(self, rnn_size, feature_size, composers):
        super(BinarizedNeuralComposer, self).__init__()
        self.rnn_size = rnn_size
        self.feature_size = feature_size
        self.composers = composers

        self.rnn = tf.contrib.cudnn_rnn.CudnnLSTM(NUM_LAYERS, self.rnn_size, dropout=0.2)
        # Trainable initial LSTM states: one (c, h) pair per composer,
        # plus one extra row reserved for the generalist
        self.comp_emb_c = tf.get_variable(
            'comp_emb_c', (len(self.composers) + 1, NUM_LAYERS, self.rnn_size))
        self.comp_emb_h = tf.get_variable(
            'comp_emb_h', (len(self.composers) + 1, NUM_LAYERS, self.rnn_size))
        self.projection = tf.layers.Dense(self.feature_size)

    def call(self, data, comp_id=-1, state=None, training=True):
        if state is None:
            h = tf.nn.embedding_lookup(params=self.comp_emb_h, ids=comp_id)
            c = tf.nn.embedding_lookup(params=self.comp_emb_c, ids=comp_id) # BATCH x LAYERS x RNN_SIZE
            h = tf.transpose(h, [1, 0, 2]) # -> LAYERS x BATCH x RNN_SIZE, as CudnnLSTM expects
            c = tf.transpose(c, [1, 0, 2])
            state = (h, c)
        
        out, state = self.rnn(data, state, training=training)
        logits = self.projection(out)

        return logits, state
In [21]:
model = BinarizedNeuralComposer(RNN_SIZE, FEATURE_SIZE, COMPOSERS)
optimizer = tf.train.AdamOptimizer(.00005)

root = tf.train.Checkpoint(optimizer=optimizer,
                           model=model,
                           optimizer_step=tf.train.get_or_create_global_step())

checkpoint = tf.train.latest_checkpoint(SAVE_DIR)
status = root.restore(checkpoint)
print(checkpoint)
composer/finalfinal_2_256-1
In [ ]:
# reg = tf.contrib.layers.l2_regularizer(L2_WEIGHT)

for comp_id, (data, length) in ds:
    begin = time()
    data = tf.transpose(data, [2, 0, 1]) # batch x keys x time -> time x batch x keys
    data = tf.to_float(tf.not_equal(data, tf.zeros_like(data))) # binarize

    x = data[:-1]
    y = data[1:]
    
    seq_length = length - 1

    length_mask = tf.expand_dims(tf.transpose(tf.to_float(tf.sequence_mask(seq_length))), -1)

    # Cool scheduled sampling technique which didn't fix much
    '''
    y_hat, _ = model(x, comp_id)
    x_hat = tf.concat([x[:1], tf.nn.sigmoid(y_hat[:-1])], axis=0)
    x_hat = tf.where(x_hat > .1, tf.ones_like(x_hat), tf.zeros_like(x_hat))
    x_hat = tf.where(tf.random.uniform(tf.shape(x_hat)) > .5, x, x_hat)
    '''
    x_hat = x

    # Train generalist / specialist 50 / 50
    comp_id = tf.where(.5 > tf.random.uniform(tf.shape(comp_id)), comp_id, -1 * tf.ones_like(comp_id))

    with tf.GradientTape() as tape:
        y_hat, _ = model(x_hat, comp_id)

        loss = tf.losses.sigmoid_cross_entropy(multi_class_labels=y, logits=y_hat, weights=length_mask)
        # loss += tf.contrib.layers.apply_regularization(reg, [model.projection.weights[0], model.comp_emb_c, model.comp_emb_h])
    grads = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(grads, model.trainable_variables),
                              global_step=tf.train.get_or_create_global_step())

    if tf.train.get_or_create_global_step().numpy() % 500 == 0:
        print(tf.train.get_or_create_global_step().numpy(), time() - begin, tf.reduce_sum(loss).numpy())
18500 0.5477919578552246 0.027942197
In [20]:
root.save(SAVE_DIR + 'finalfinal_2_256')
Out[20]:
'composer/finalfinal_2_256-1'
In [8]:
# Helper functions copied from github.com/zehsilva/neural-composer-assignement

def piano_roll_to_pretty_midi(piano_roll, fs=100, program=1):
    '''Convert a Piano Roll array into a PrettyMidi object
     with a single instrument.
    Parameters
    ----------
    piano_roll : np.ndarray, shape=(128,frames), dtype=int
        Piano roll of one instrument
    fs : int
        Sampling frequency of the columns, i.e. each column is spaced apart
        by ``1./fs`` seconds.
    program : int
        The program number of the instrument.
    Returns
    -------
    midi_object : pretty_midi.PrettyMIDI
        A pretty_midi.PrettyMIDI class instance describing
        the piano roll.
    '''
    notes, frames = piano_roll.shape
    pm = pretty_midi.PrettyMIDI()
    instrument = pretty_midi.Instrument(program=program)

    # pad a column of zeros on each side so initial and ending events are captured
    piano_roll = np.pad(piano_roll, [(0, 0), (1, 1)], 'constant')

    # use changes in velocities to find note on / note off events
    velocity_changes = np.nonzero(np.diff(piano_roll).T)

    # keep track of velocities and note on times
    prev_velocities = np.zeros(notes, dtype=int)
    note_on_time = np.zeros(notes)

    for time, note in zip(*velocity_changes):
        # use time + 1 because of padding above
        velocity = piano_roll[note, time + 1]
        time = time / fs
        if velocity > 0:
            if prev_velocities[note] == 0:
                note_on_time[note] = time
                prev_velocities[note] = velocity
        else:
            pm_note = pretty_midi.Note(
                velocity=prev_velocities[note],
                pitch=note,
                start=note_on_time[note],
                end=time)
            instrument.notes.append(pm_note)
            prev_velocities[note] = 0
    pm.instruments.append(instrument)
    return pm

def visualize_piano_roll(pianoroll_matrix, fs=5):
    """ input: piano roll matrix with shape (number of notes, time steps)
        effect: generates a nice graph with the piano roll visualization
    """
    if pianoroll_matrix.shape[0] == 128:
        pianoroll_matrix = pianoroll_matrix.T.astype(float)
    track = pproll.Track(pianoroll=pianoroll_matrix, program=0, is_drum=False, name='piano roll')
    # Plot the piano roll
    fig, ax = track.plot(beat_resolution=fs)
    plt.show()

Failed attempts and other directions

The model trained very slowly, so I tried a few tricks to simplify the problem. I also thought a little about making the training objective more similar to the generative use case.

Things tried

  • Full regression on pitch - Couldn't find a sweet spot between absolute junk and total memorization.
  • Weight the training signal of keys on / off according to the per-key frequency - Very unstable training (a reconstruction is sketched after this list).
  • Predict the number of activated keys at the next step and take that many - Didn't train any faster.
  • Predict 3 steps ahead as an auxiliary task - Didn't warrant the extra complexity, and the predictions weren't used for anything at test time.
  • Scheduled sampling for recovery (inject predictions during training) - Very surprised this didn't work better!
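
As referenced above, a hypothetical reconstruction of the per-key frequency weighting (not the exact code used; y, y_hat, and length_mask are as in the training loop above):

In [ ]:
# Hypothetical reconstruction of the frequency-weighting attempt, not the
# exact code used. Rarely active keys get a larger weight in the cross
# entropy, which in practice made training very unstable.
key_freq = tf.reduce_mean(y, axis=[0, 1])  # per-key on-frequency, shape (128,)
key_weight = 1. / (key_freq + EPSILON)     # upweight rarely active keys
loss = tf.losses.sigmoid_cross_entropy(
    multi_class_labels=y, logits=y_hat, weights=length_mask * key_weight)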

Things that would be fun to try

  • Formulate as a sequence GAN and train as a reinforcement learning policy. Too many moving parts!
In [9]:
# Utils for reading piano roll file and sample songs from model

def show_and_tell(path, composer_ids=(), length=30, fs=5):
    print('Original song')
    song = read(path)
    render_pianoroll(song[:length * fs,0,:])
    render_audio(song, length)

    print('Generalist')
    composed = compose(song, [-1], start_seconds=5, length_seconds=length - 5, fs=fs).numpy()
    render_pianoroll(composed[:length * fs,0,:])
    render_audio(composed, length)
    
    for composer_id in composer_ids:
        print('Specialist:', COMPOSERS[composer_id])
        composed = compose(song, [composer_id], start_seconds=5, length_seconds=length - 5, fs=fs).numpy()
        render_pianoroll(composed[:length * fs,0,:])
        render_audio(composed, length)

def read(path):
    piano_roll = pd.read_csv(path) # Read CSV
    piano_roll = piano_roll.values[:,1:] # Remove indices
    piano_roll = tf.transpose(piano_roll)
    piano_roll = tf.expand_dims(piano_roll, 1)
    piano_roll = tf.minimum(1., piano_roll) # binarize
    
    return piano_roll.numpy()

def render_pianoroll(song):
    plt.figure(figsize=(15,5))
    plt.imshow(song.transpose(), cmap='hot')
    plt.show()
    visualize_piano_roll(song)

def render_audio(song, length):
    pm = piano_roll_to_pretty_midi(song[:,0,:].transpose(), fs=5)
    signal = pm.synthesize()
    IPython.display.display(IPython.display.Audio(signal[:44100*length], rate=44100))

def compose(song, composer_id, start_seconds=5, length_seconds=20, fs=5):
    seed = song[:fs * start_seconds]

    y_hats, state = model(seed, composer_id, training=False)

    y_hat = y_hats[-1:]
    y_hat = round_to_zero(y_hat)
    result = [seed, y_hat]

    for _ in range(fs * length_seconds):
        y_hat, state = model(y_hat, composer_id, state, training=False)
        y_hat = round_to_zero(y_hat)
        result.append(y_hat)
    return tf.concat(result, axis=0)

def round_to_zero(x, threshold=.1):
    x = tf.nn.sigmoid(x)
    return tf.where(x > threshold, tf.ones_like(x), tf.zeros_like(x))
In [10]:
show_and_tell('songs/bach_847.csv', composer_ids=[COMPOSERS.index('ba'), COMPOSERS.index('de')])
Original song
Generalist
Specialist: ba
Specialist: de
In [12]:
show_and_tell('unseen_songs/debussy_prel.csv', composer_ids=[COMPOSERS.index('ba'), COMPOSERS.index('de')])
Original song
Generalist
Specialist: ba
Specialist: de
In [ ]: